library("SummarizedExperiment")
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which, which.max, which.min
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: DelayedArray
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians
## 
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
## 
##     colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following objects are masked from 'package:base':
## 
##     aperm, apply, rowsum
library("readODS")
library("scatterD3")
library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:IRanges':
## 
##     slice
## The following object is masked from 'package:S4Vectors':
## 
##     rename
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library (DESeq2)

DESeq2Normalization <- function(sExpr, iqr.cutoff=0.1, plot=c("2d", "3d")) {
  ###########################################################################
  ## PART 0. Check function arguments
  ###########################################################################
  #fun.main <- deparse(match.call()[[1]])
  #  .stopIfNotExpressionSetOrSummarizedExperiment(inputObj, 'inputObj', fun.main)
  #  .stopIfNotNumeric0to1(iqr.cutoff, 'min.diff.cutoff', fun.main)
  #  sExpr <- .tryMakeSummarizedExperimentFromExpressionSet(inputObj)
  
  ###########################################################################
  ## PART I. Filter samples according to phenoData
  ###########################################################################
  ## o phenoData table contains which samples should be used in the analysis
  ##    o the samples which should be used in the analysis will have an
  ##     assigned category sExpr@phenoData$category, as "standard" or "test"
  ##    o non-assigned samples with NA values will be ignored
  ## o NOTE
  ##   assigning NA values to samples is an easy way to eliminate samples
  ##   from the analysis, without having to remove them from all input tables
  ##   (eg removing from sExpr, pdata, calls)
  phenoData <- colData(sExpr)
  ## filter out samples with missing category and/or general cell type
  phenoData.sel <- .filterPheno(phenoData, fun.main, "na")
  
  ############################################################################
  ## PART II. Filter genes in dataset
  ############################################################################
  ## A. For the cosine scoring, keep only the samples in the selected category
  ## (either 'standard' or 'test')

  #if (sum(phenoData.sel) == 0)
  #  stop(paste("No samples in selected category found, exiting function",
  #             fun.main))

  sExpr <- sExpr[, rownames(phenoData.sel)]

  # remove genes without any counts
  sExpr <- sExpr[rowSums(assay(sExpr, "counts")) > 0 , ]

  # Create DESeq.ds from the summarizedExperiment object
  DESeq.ds <- DESeqDataSetFromMatrix(countData = (assay(sExpr, "counts") + 1),
    colData = colData(sExpr),
    rowData = rowData(sExpr),
    design = ~cell_type)

  # DESeq2 Default normalization method
  DESeq.dsDefault <- estimateSizeFactors(DESeq.ds)
  counts.sf_normalized <- counts(DESeq.dsDefault, normalized = TRUE)
  log.norm.counts <- log2(counts.sf_normalized + 1)

  log.norm.counts
}

#TPMNormalization <- function(sExpr, iqr.cutoff=0.1, plot=c("2d", "3d")) {
#  ###########################################################################
#  ## PART 0. Check function arguments
#  ###########################################################################
#  #fun.main <- deparse(match.call()[[1]])
#  #  .stopIfNotExpressionSetOrSummarizedExperiment(inputObj, 'inputObj', fun.main)
#  #  .stopIfNotNumeric0to1(iqr.cutoff, 'min.diff.cutoff', fun.main)
#  #  sExpr <- .tryMakeSummarizedExperimentFromExpressionSet(inputObj)
#  
#  ###########################################################################
#  ## PART I. Filter samples according to phenoData
#  ###########################################################################
#  ## o phenoData table contains which samples should be used in the analysis
#  ##    o the samples which should be used in the analysis will have an
#  ##     assigned category sExpr@phenoData$category, as "standard" or "test"
#  ##    o non-assigned samples with NA values will be ignored
#  ## o NOTE
#  ##   assigning NA values to samples is an easy way to eliminate samples
#  ##   from the analysis, without having to remove them from all input tables
#  ##   (eg removing from sExpr, pdata, calls)
#  phenoData <- colData(sExpr)
#  ## filter out samples with missing category and/or general cell type
#  phenoData.sel <- .filterPheno(phenoData, fun.main, "na")
#  
#  ############################################################################
#  ## PART II. Filter genes in dataset
#  ############################################################################
#  ## A. For the cosine scoring, keep only the samples in the selected category
#  ## (either 'standard' or 'test')
#
#  #if (sum(phenoData.sel) == 0)
#  #  stop(paste("No samples in selected category found, exiting function",
#  #             fun.main))
#
#  sExpr <- sExpr[, rownames(phenoData.sel)]
#
#  # remove genes without any counts
#  sExpr <- sExpr[rowSums(assay(sExpr, "counts")) > 0 , ]
#
#  # 
#}



PcaPlot <- function(normalizedCount, pdata, colorVar, 
  symbolVar, plot = c("3d", "2d")) {
  ############################################################################
  ## PART III. PCA Analysis of the filtered data
  ############################################################################
  ##  1. Euclidean distance between categories
  ##  2. Cosine similarity between categories
  ## -DONT USE-3. cosine "score" between categories
  ##      o 0 < cosine score <1 where range is set to 1-min(cosine.similarity)
  ## What is this good for?
  ##   a. Visualization and exploratory data analysis
  ##   b. Generates values to be used for cell scoring
  
  ## Remove any rows that have zero variance
  pca <- prcomp(t(normalizedCount), scale=TRUE)
  
  ## Get proportion of variance explained by PC1 and PC2
  pca.sum <- summary(pca)
  pc1 <- pca.sum$importance[2,1] * 100
  pc2 <- pca.sum$importance[2,2] * 100
  pc3 <- pca.sum$importance[2,3] * 100
  
  print(pc1)
  print(pc2)
  print(pc3)
    
  ## Extract coordinates from pca object
  pca.comp <- as.data.frame(pca$x[, c(1, 2, 3)])
  pca.comp <- cbind(pca.comp,
    cell_type = pdata[, "cell_type"], 
    category = pdata[, "category"])
  
  tooltips <- paste(" <strong>", rownames(pca.comp),"</strong><br />", pca.comp$cell_type)

  if (plot == "2d") {
    ## 2D plot
    scatterD3(data = pca.comp, x=PC1, y=PC2, tooltip_text = tooltips,
            col_var=cell_type, symbol_var=category)
  } else {
    ## 3D plot
    fig <- plot_ly(pca.comp, x = ~PC1, y = ~PC2, z = ~PC3, color = ~cell_type, symbol = ~category, symbols = c('circle','x','o'))
    fig <- fig %>% add_markers()
    fig <- fig %>% layout(scene = list(xaxis = list(title = paste('PC1')),
                                     yaxis = list(title = ('PC2')),
                                     zaxis = list(title = ('PC3'))))
  
    fig
  }
}

################################################################################
## FUNCTION: .filterPheno
################################################################################
#' From utils.R in CellScore
#' Filters samples from phenotype data with missing info
#' @param pheno a DataFrame with phenotype descriptions
#' @param calling.fun a character - name of the parent function calling this
#'   function
#' @param flag a character - type of filter, one of the following: 'na',
#'   'group', subgroup', anygroup' (the last option checks for any match in
#'   group or subgroup)
#' @param flag.value a character - cell name needed for the 'specifc' filter
#' @return filtered data.frame
#' @keywords filter
.filterPheno <- function(pheno, calling.fun,
                         flag = c("na", "group", "subgroup", "anygroup"),
                         flag.value=NULL){
    ## If filtering by (sub)groups, the flag.value must be specified
    stopifnot(flag == "na" || !is.null(flag.value))

    sel <- switch(flag,
                  na = !is.na(pheno$category) & !is.na(pheno$general_cell_type),
                  group = pheno$general_cell_type %in% flag.value,
                  subgroup = pheno$sub_cell_type1 %in% flag.value,
                  anygroup = pheno$general_cell_type %in% flag.value |
                      pheno$sub_cell_type1 %in% flag.value)

    if (sum(sel) == 0){
        print(sel)
        stop(paste("No samples left after phenotype filtering, exiting function",
                   calling.fun))
    }

    pheno[sel, ]
}

# read the file selectedDEE2SExpr.rds that stores SummarizedExperiment object
# of the count data.
selectedDEE2SExpr <- readRDS("../selectedDEE2SExpr.rds")

# normalize the count data
normalizedCount <- DESeq2Normalization(selectedDEE2SExpr)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
#     
PcaPlot(normalizedCount, colData(selectedDEE2SExpr), plot="2d")
## [1] 21.015
## [1] 7.024
## [1] 5.963
PcaPlot(normalizedCount, colData(selectedDEE2SExpr), plot="3d")
## [1] 21.015
## [1] 7.024
## [1] 5.963
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors